In [11]:
# Load the necessary Python modules
import matplotlib.pyplot as plt
import matplotlib
import pickle
import pandas as pd
import numpy as np
from IPython.display import display
%matplotlib notebook
In [12]:
### Load the dictionary containing the dataset. This code is taken from the poi_id.py script provided by Udacity.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
In [13]:
# Get some initial stats for the project report
print("Total number of persons: %d" % len(data_dict))
print("Total number of features: %d" % len(list(data_dict.values())[0]))
print("Total number of POIs: %d" % sum(1 for x in data_dict.values() if x['poi']))
In [14]:
# List all person names (keys) in the dataset
print(data_dict.keys())
In [15]:
# converting the dictionary dataset to a pandas dataframe
enron_df = pd.DataFrame.from_dict(data_dict)
# Remove the entries for TOTAL and THE TRAVEL AGENCY IN THE PARK, as they are not actual persons
del enron_df['TOTAL']
del enron_df['THE TRAVEL AGENCY IN THE PARK']
enron_df = enron_df.transpose()
enron_df_num = enron_df.apply(pd.to_numeric, errors='coerce')
# Remove email_address from the dataset, as it's a non-numeric feature and isn't useful for this analysis
del enron_df_num['email_address']
enron_df_num.describe()
Out[15]:
In [16]:
len(enron_df_num)
Out[16]:
We are left with 144 records in our dataframe.
The summary of the dataset also shows a very large standard deviation for some features and a lot of missing data for others. We will drop some of these features below.
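Before dropping anything, it helps to see how much of each feature is actually missing. A quick check (a small sketch; the 'NaN' strings became real NaN values when we coerced to numeric above):
# Count missing values per feature, most-missing first
print(enron_df_num.isnull().sum().sort_values(ascending=False))
This motivates the deletions in the next cell, which target the sparsest features.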
In [17]:
# Drop the features with the most missing values
del enron_df_num['loan_advances']
del enron_df_num['restricted_stock_deferred']
del enron_df_num['director_fees']
In [18]:
# Feature selection: look at each feature's correlation with the POI label
data_corr_list = enron_df_num.corr()
print('\nCorrelation of each feature with POI:\n' + str(data_corr_list['poi']))
The features 'exercised_stock_options', 'total_stock_value', and 'bonus' have the highest correlation with POI, in descending order.
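To see the full ranking at a glance (a small convenience sketch, not part of the original analysis), the correlations can be sorted by absolute value:
# Rank features by absolute correlation with 'poi'
print(data_corr_list['poi'].abs().sort_values(ascending=False))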
In [19]:
# Remove the label from the feature matrix and keep it separately
del enron_df_num['poi']
poi = enron_df['poi']
# Create new aggregate and ratio features
enron_df_num['stock_sum'] = enron_df_num['exercised_stock_options'] + \
                            enron_df_num['total_stock_value'] + \
                            enron_df_num['restricted_stock']
enron_df_num['stock_ratio'] = enron_df_num['exercised_stock_options'] / enron_df_num['total_stock_value']
enron_df_num['money_total'] = enron_df_num['salary'] + \
                              enron_df_num['bonus'] - \
                              enron_df_num['expenses']
enron_df_num['money_ratio'] = enron_df_num['bonus']/enron_df_num['salary']
enron_df_num['email_ratio'] = enron_df_num['from_messages']/(enron_df_num['to_messages']+enron_df_num['from_messages'])
enron_df_num['poi_email_ratio_from'] = enron_df_num['from_poi_to_this_person']/enron_df_num['to_messages']
enron_df_num['poi_email_ratio_to'] = enron_df_num['from_this_person_to_poi']/enron_df_num['from_messages']
# Fill NA values with the column means
enron_df_num = enron_df_num.fillna(enron_df_num.mean())
# Min-max scale each feature to the [0, 1] range
enron_df_num = (enron_df_num - enron_df_num.min()) / (enron_df_num.max() - enron_df_num.min())
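A note on the last two steps: mean imputation keeps every record usable, and min-max scaling puts all features on a common [0, 1] range. The scaling matters mainly for the SVC tried below, since kernel methods are sensitive to feature magnitudes; decision trees and Gaussian Naive Bayes are essentially unaffected by per-feature linear rescaling.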
In [20]:
from sklearn.feature_selection import SelectKBest
# SelectKBest scores each feature against the label (f_classif, the ANOVA F-test, by default)
selector = SelectKBest()
selector.fit(enron_df_num, poi.tolist())
scores = {enron_df_num.columns[i]: selector.scores_[i] for i in range(len(enron_df_num.columns))}
sorted_features = sorted(scores, key=scores.get, reverse=True)
for feature in sorted_features:
    print('Feature %s has value %f' % (feature, scores[feature]))
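Only the ranking is used above; if we instead wanted the filtered feature matrix directly, SelectKBest can make the cut itself (a small sketch; k=5 is an arbitrary illustrative choice):
from sklearn.feature_selection import f_classif
# Keep only the 5 highest-scoring features and return the reduced matrix
top5 = SelectKBest(f_classif, k=5).fit_transform(enron_df_num, poi.tolist())
print(top5.shape)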
In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Note: in newer scikit-learn (>= 0.20), grid_search and cross_validation
# were merged into sklearn.model_selection
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.cross_validation import StratifiedShuffleSplit
import scipy
import warnings
warnings.filterwarnings('ignore')
# GaussianNB has no hyperparameters to tune; wrap it in GridSearchCV anyway for a uniform interface
gnb_clf = GridSearchCV(GaussianNB(), {})
svc_clf = SVC()
svc_search_params = {'C': scipy.stats.expon(scale=1),
                     'gamma': scipy.stats.expon(scale=.1),
                     'kernel': ['linear', 'poly', 'rbf'],
                     'class_weight': ['balanced', None]}
svc_search = RandomizedSearchCV(svc_clf,
                                param_distributions=svc_search_params,
                                n_iter=25)
tree_clf = DecisionTreeClassifier()
tree_search_params = {'criterion': ['gini', 'entropy'],
                      'max_leaf_nodes': [None, 25, 50, 100, 1000],
                      'min_samples_split': [2, 3, 4],
                      'max_features': [0.25, 0.5, 0.75, 1.0]}
tree_search = GridSearchCV(tree_clf,
                           tree_search_params,
                           scoring='recall')
search_methods = [gnb_clf, svc_search, tree_search]
average_accuracies = [[0], [0], [0]]
average_precision = [[0], [0], [0]]
average_recall = [[0], [0], [0]]
num_splits = 10
train_split = 0.9
# Old-style StratifiedShuffleSplit API: (labels, n_iter, test_size, random_state)
indices = list(StratifiedShuffleSplit(poi.tolist(),
                                      num_splits,
                                      test_size=1 - train_split,
                                      random_state=0))
best_features = None
max_score = 0
best_classifier = None
for num_features in range(1, len(sorted_features) + 1):
    features = sorted_features[:num_features]
    feature_df = enron_df_num[features]
    for classifier_idx in range(3):
        sum_values = [0, 0, 0]
        # Run the hyperparameter search once per feature set (on the first split);
        # repeating it for every split would be too wasteful
        search_methods[classifier_idx].fit(feature_df.iloc[indices[0][0], :],
                                           poi.iloc[indices[0][0]].tolist())
        classifier = search_methods[classifier_idx].best_estimator_
        for split_idx in range(num_splits):
            train_indices, test_indices = indices[split_idx]
            train_data = (feature_df.iloc[train_indices, :], poi.iloc[train_indices].tolist())
            test_data = (feature_df.iloc[test_indices, :], poi.iloc[test_indices].tolist())
            classifier.fit(train_data[0], train_data[1])
            predicted = classifier.predict(test_data[0])
            # sklearn metrics expect (y_true, y_pred) in that order
            sum_values[0] += accuracy_score(test_data[1], predicted)
            sum_values[1] += precision_score(test_data[1], predicted)
            sum_values[2] += recall_score(test_data[1], predicted)
        avg_acc, avg_prs, avg_recall = [val / num_splits for val in sum_values]
        average_accuracies[classifier_idx].append(avg_acc)
        average_precision[classifier_idx].append(avg_prs)
        average_recall[classifier_idx].append(avg_recall)
        score = (avg_prs + avg_recall) / 2
        if score > max_score and avg_prs > 0.3 and avg_recall > 0.3:
            max_score = score
            best_features = features
            best_classifier = search_methods[classifier_idx].best_estimator_
print('Best classifier found is %s \n\
with score (recall+precision)/2 of %f\n\
and feature set %s'%(str(best_classifier),max_score,best_features))
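As a possible final step (a sketch only, not executed here): Udacity's starter repo ships a tester.py with a dump_classifier_and_data helper for grading. Since tester.py reads features by name from the data dictionary, the engineered features would first have to be written back into it, roughly like so:
# Sketch: export the chosen model for Udacity's tester.py
# (assumes tester.py from the starter repo is importable)
from tester import dump_classifier_and_data
my_dataset = enron_df_num.copy()
my_dataset['poi'] = poi
my_dataset = my_dataset.to_dict(orient='index')  # {person: {feature: value}}
dump_classifier_and_data(best_classifier, my_dataset, ['poi'] + best_features)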
In [ ]: